Author: Stephen Situ

This project focuses on building efficient data pipelines for large datasets that cannot be loaded into memory all at once. To achieve this, we explore three methods.

The first method uses plain Python classes together with Spark dataframes to process the data. This approach suits MLlib models, which can accept Spark dataframes directly.

The second method is similar to the first, but implements the classes as custom Transformers and chains them with Spark ML's Pipeline for a more streamlined transformation process.

The third and final method uses TensorFlow's tf.data.Dataset API to manipulate and preprocess the data for a TensorFlow model. Although it differs from a traditional dataframe, a tf.data.Dataset is a powerful way to process large amounts of data. Transformations are written in a functional style using TensorFlow's built-in functions and methods, which gives fine-grained control over data manipulation and preprocessing for optimizing model performance. Each method has its own strengths and trade-offs, and we explore each in depth to determine the best approach for a given use case. Overall, the goal is robust, efficient data pipelines that can handle large and complex datasets, paving the way for successful machine learning applications on "big data".
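
To make the functional style concrete, here is a minimal toy sketch (not part of the project code, and unrelated to the crime data) of chaining tf.data transformations:

import tensorflow as tf

# Build a small dataset and chain functional transformations
ds = tf.data.Dataset.range(10)          # integers 0..9
ds = ds.filter(lambda x: x % 2 == 0)    # keep even numbers only
ds = ds.map(lambda x: x * x)            # square each element
print(list(ds.as_numpy_iterator()))     # [0, 4, 16, 36, 64]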

Original data here: https://www.kaggle.com/datasets/wosaku/crime-in-vancouver

In [ ]:
# import libraries
import pandas as pd 
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import tensorflow as tf
In [245]:
# Create spark session
spark = SparkSession.builder.appName('Test').getOrCreate()
In [184]:
# Check version
spark
Out[184]:

SparkSession - in-memory
SparkContext
Spark UI
Version: v3.2.1
Master: local[*]
AppName: Test
In [248]:
# Read CSV
df = spark.read.csv('crime.csv',header=True,inferSchema=True)
In [186]:
# Show
df.show()
+--------------------+----+-----+---+----+------+--------------------+------------------+---------+----------+-----------+------------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|     NEIGHBOURHOOD|        X|         Y|   Latitude|   Longitude|
+--------------------+----+-----+---+----+------+--------------------+------------------+---------+----------+-----------+------------+
|         Other Theft|2003|    5| 12|  16|    15|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|         Other Theft|2003|    5|  7|  15|    20|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|         Other Theft|2003|    4| 23|  16|    40|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|         Other Theft|2003|    4| 20|  11|    15|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|         Other Theft|2003|    4| 12|  17|    45|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|         Other Theft|2003|    3| 26|  20|    45|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|Break and Enter R...|2003|    3| 10|  12|     0|   63XX WILTSHIRE ST|        Kerrisdale|489325.58|5452817.95|49.22805078|-123.1466105|
|            Mischief|2003|    6| 28|   4|    13|     40XX W 19TH AVE| Dunbar-Southlands|485903.09|5455883.77|49.25555918|-123.1937252|
|         Other Theft|2003|    2| 16|   9|     2|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|Break and Enter R...|2003|    7|  9|  18|    15|      18XX E 3RD AVE|Grandview-Woodland|495078.19|5457221.38|49.26773386| -123.067654|
|         Other Theft|2003|    1| 31|  19|    45|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|            Mischief|2003|    9| 27|   1|     0|     40XX W 21ST AVE| Dunbar-Southlands|485852.96|5455684.11|49.25376204| -123.194407|
|Break and Enter R...|2003|    4| 19|  18|     0|      18XX E 3RD AVE|Grandview-Woodland|495093.69|5457230.31|49.26781432|-123.0674411|
|Break and Enter R...|2003|    9| 24|  18|    30|      18XX E 3RD AVE|Grandview-Woodland|495103.82|5457221.02|49.26773083|-123.0673017|
|Break and Enter R...|2003|   11|  5|   8|    12|     63XX WINDSOR ST|            Sunset|493790.48| 5452630.9|49.22642977|-123.0852834|
|Break and Enter C...|2003|    9| 26|   2|    30|     10XX ALBERNI ST|          West End|491067.65|5459114.22|49.28471484|-123.1228242|
|Break and Enter R...|2003|   10| 21|  10|     0|      18XX E 3RD AVE|Grandview-Woodland|495119.32|5457229.95|49.26781128|-123.0670888|
|         Other Theft|2003|    1| 25|  12|    30|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
|Offence Against a...|2003|    2| 12|null|  null|OFFSET TO PROTECT...|              null|      0.0|       0.0|        0.0|         0.0|
|         Other Theft|2003|    1|  9|   6|    45|    9XX TERMINAL AVE|        Strathcona| 493906.5|5457452.47|49.26980201|-123.0837633|
+--------------------+----+-----+---+----+------+--------------------+------------------+---------+----------+-----------+------------+
only showing top 20 rows

In [187]:
# look at Schema
df.printSchema()
root
 |-- TYPE: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- HOUR: integer (nullable = true)
 |-- MINUTE: integer (nullable = true)
 |-- HUNDRED_BLOCK: string (nullable = true)
 |-- NEIGHBOURHOOD: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)

In [188]:
# Create a class Drop_1 that drops duplicate rows, rows with N/A's, and rows whose coordinate columns (X, Y, Latitude, Longitude) are all 0
class Drop_1():

    def __init__(self):
        pass

    def drop(self, data):
        data1 = data.dropDuplicates()
        data2 = data1.dropna()
        data3 = data2.filter((data2['X'] != 0) | (data2['Y'] != 0) | (data2['Latitude'] != 0) | (data2['Longitude'] != 0))
        return data3
In [189]:
# Cast columns as different types
class Caster():

    def __init__(self):
        pass

    def cast(self, data):
        data1 = data.withColumn("X_float", data["X"].cast("float"))
        data2 = data1.withColumn("Y_float", data1["Y"].cast("float"))     
        data3 = data2.withColumn("Z", (data2['X_float'] / data2['Y_float']))
        data4 = data3.withColumn("Year_int", data3["YEAR"].cast("int"))
        data5 = data4.withColumn("Month_int", data4["MONTH"].cast("int"))
        data6 = data5.withColumn("Day_int", data5["DAY"].cast("int"))
        data7 = data6.withColumn("Hour_int", data6["HOUR"].cast("int"))
        data8 = data7.withColumn("Minute_int", data7["MINUTE"].cast("int"))
        data9 = data8.withColumn("Lat_float", data8["Latitude"].cast("float"))
        data10 = data9.withColumn("Long_float", data9["Longitude"].cast("float"))
        return data10
In [190]:
# Drop_2 class drops a list of columns
class Drop_2():

    def __init__(self):
        pass
    def drop(self, data, input_cols):
        data1 = data.drop(*input_cols)
        return data1
In [191]:
# OneHotEncoderTransformer one hot encodes a list of columns
from pyspark.ml.feature import OneHotEncoder, StringIndexer
class OneHotEncoderTransformer:
    def __init__(self):
        pass

    def one_hot_encode(self, data, input_cols):
        indexer = StringIndexer(inputCols=input_cols, outputCols=[col+"_index" for col in input_cols])
        indexed_data = indexer.fit(data).transform(data)
        encoder = OneHotEncoder(inputCols=[col+"_index" for col in input_cols],
        outputCols=[col+"_vec" for col in input_cols])
        encoded_data = encoder.fit(indexed_data).transform(indexed_data)
        return encoded_data
In [192]:
# train_test_split class accepts a split fraction (split), a label column (column_y), and a random seed (set_seed)
# for creating the training/test split
class train_test_split():

    def __init__(self):
        pass

    def split(self, data, split, column_y, set_seed):
        train_weight = split
        test_weight = 1 - train_weight
        train_df, test_df = data.randomSplit([train_weight, test_weight], seed=set_seed)
        train_df_x = train_df.drop(column_y)
        train_df_y = train_df.select(column_y)
        test_df_x = test_df.drop(column_y)
        test_df_y = test_df.select(column_y)
        return train_df_x, train_df_y, test_df_x, test_df_y
In [194]:
# Create an instance of each class
dropper_1 = Drop_1()
caster = Caster()
dropper_2 = Drop_2()
one_hot = OneHotEncoderTransformer()
splitter = train_test_split()
In [195]:
# Pass dataframe through all the classes/methods sequentially
df1 = dropper_1.drop(df)
df2 = caster.cast(df1)
df3 = dropper_2.drop(df2,['HUNDRED_BLOCK'])
df4 = one_hot.one_hot_encode(df3,['TYPE','NEIGHBOURHOOD'])
df5 = dropper_2.drop(df4,['TYPE_vec','NEIGHBOURHOOD_vec','TYPE','NEIGHBOURHOOD','YEAR','MONTH','DAY','HOUR','MINUTE','X','Y','Latitude','Longitude'])
train_df_x, train_df_y, test_df_x, test_df_y = splitter.split(df5,0.8,'TYPE_index',42)
In [196]:
# Check number of rows
print("Number of rows in Train X: ", train_df_x.count())
print("Number of rows in Train Y: ", train_df_y.count())
print("Number of rows in Test X: ", test_df_x.count())
print("Number of rows in Test Y: ", test_df_y.count())
Number of rows in Train X:  379175
Number of rows in Train Y:  379175
Number of rows in Test X:  94839
Number of rows in Test Y:  94839
In [197]:
# Check schema (printSchema prints directly and returns None, which is why each output below ends with 'None')
print("Schema of Train X: ", train_df_x.printSchema())
print("Schema of Train Y: ", train_df_y.printSchema())
print("Schema of Test X: ", test_df_x.printSchema())
print("Schema of Test Y: ", test_df_y.printSchema())
root
 |-- X_float: float (nullable = true)
 |-- Y_float: float (nullable = true)
 |-- Z: double (nullable = true)
 |-- Year_int: integer (nullable = true)
 |-- Month_int: integer (nullable = true)
 |-- Day_int: integer (nullable = true)
 |-- Hour_int: integer (nullable = true)
 |-- Minute_int: integer (nullable = true)
 |-- Lat_float: float (nullable = true)
 |-- Long_float: float (nullable = true)
 |-- NEIGHBOURHOOD_index: double (nullable = false)

Schema of Train X:  None
root
 |-- TYPE_index: double (nullable = false)

Schema of Train Y:  None
root
 |-- X_float: float (nullable = true)
 |-- Y_float: float (nullable = true)
 |-- Z: double (nullable = true)
 |-- Year_int: integer (nullable = true)
 |-- Month_int: integer (nullable = true)
 |-- Day_int: integer (nullable = true)
 |-- Hour_int: integer (nullable = true)
 |-- Minute_int: integer (nullable = true)
 |-- Lat_float: float (nullable = true)
 |-- Long_float: float (nullable = true)
 |-- NEIGHBOURHOOD_index: double (nullable = false)

Schema of Test X:  None
root
 |-- TYPE_index: double (nullable = false)

Schema of Test Y:  None
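
Since the point of this method is feeding MLlib models, here is a minimal sketch (not executed above; it assumes df5 from the preceding cells) of how the processed dataframe could be consumed. MLlib estimators expect the features and the label in a single dataframe, so the sketch assembles df5's feature columns and splits that dataframe directly, rather than reusing the separate x/y split:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Assemble every non-label column of df5 into a single 'features' vector
feature_cols = [c for c in df5.columns if c != 'TYPE_index']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
assembled = assembler.transform(df5)

# Split, then fit; labelCol points at the StringIndexer output
train, test = assembled.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(featuresCol='features', labelCol='TYPE_index')
lr_model = lr.fit(train)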
In [211]:
# Do the same as above, but with classes that inherit from the Transformer class in pyspark.ml.
# Within each __init__ method, super() calls the __init__ of the parent class (Transformer).
from pyspark.ml import Transformer
from pyspark.sql.functions import col

class Drop_1(Transformer):
    
    def __init__(self):
        super(Drop_1, self).__init__()
    
    def _transform(self, data):
        data1 = data.dropDuplicates()
        data2 = data1.dropna()
        data3 = data2.filter((col('X') != 0) | (col('Y') != 0) | (col('Latitude') != 0) | (col('Longitude') != 0))
        return data3
In [212]:
class Caster(Transformer):
    
    def __init__(self):
        super(Caster, self).__init__()
    
    def _transform(self, data):
        data1 = data.withColumn('X_float', col('X').cast('float'))
        data2 = data1.withColumn('Y_float', col('Y').cast('float'))
        data3 = data2.withColumn('Z', col('X_float') / col('Y_float'))
        data4 = data3.withColumn('Year_int', col('YEAR').cast('int'))
        data5 = data4.withColumn('Month_int', col('MONTH').cast('int'))
        data6 = data5.withColumn('Day_int', col('DAY').cast('int'))
        data7 = data6.withColumn('Hour_int', col('HOUR').cast('int'))
        data8 = data7.withColumn('Minute_int', col('MINUTE').cast('int'))
        data9 = data8.withColumn('Lat_float', col('Latitude').cast('float'))
        data10 = data9.withColumn('Long_float', col('Longitude').cast('float'))
        return data10
In [213]:
class Drop_2(Transformer):
    
    def __init__(self, input_cols):
        super(Drop_2, self).__init__()
        self.input_cols = input_cols
        
    def _transform(self, data):
        data1 = data.drop(*self.input_cols)
        return data1
In [214]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer


class OneHotEncoderTransformer(Transformer):
    
    def __init__(self, input_cols):
        super(OneHotEncoderTransformer, self).__init__()
        self.input_cols = input_cols
        
    def _transform(self, data):
        indexer = StringIndexer(inputCols=self.input_cols, outputCols=[col+"_index" for col in self.input_cols])
        indexed_data = indexer.fit(data).transform(data)
        encoder = OneHotEncoder(inputCols=[col+"_index" for col in self.input_cols],
        outputCols=[col+"_vec" for col in self.input_cols])
        encoded_data = encoder.fit(indexed_data).transform(indexed_data)
        return encoded_data
In [215]:
class SplitTransformer(Transformer):
    
    def __init__(self, split, column_y, set_seed):
        super(SplitTransformer, self).__init__()
        self.split = split
        self.column_y = column_y
        self.set_seed = set_seed
        
    def _transform(self, data):
        train_weight = self.split
        test_weight = 1 - train_weight
        train_df, test_df = data.randomSplit([train_weight, test_weight], seed=self.set_seed)
        train_df_x = train_df.drop(self.column_y)
        train_df_y = train_df.select(self.column_y)
        test_df_x = test_df.drop(self.column_y)
        test_df_y = test_df.select(self.column_y)
        return train_df_x, train_df_y, test_df_x, test_df_y
In [216]:
# Import Pipeline and create instance of each class
from pyspark.ml import Pipeline
dropper_1 = Drop_1()
caster = Caster()
dropper_2 = Drop_2(input_cols=['HUNDRED_BLOCK'])
one_hot = OneHotEncoderTransformer(input_cols=['TYPE','NEIGHBOURHOOD'])
dropper_3 = Drop_2(input_cols=['TYPE_vec','NEIGHBOURHOOD_vec','TYPE','NEIGHBOURHOOD','YEAR','MONTH','DAY','HOUR','MINUTE','X','Y','Latitude','Longitude'])
splitter = SplitTransformer(split=0.8,column_y='TYPE_index',set_seed=42)
In [217]:
# Define the stages, build the Pipeline, fit it with pipeline.fit, then apply model.transform.
# Because the final stage's _transform returns a tuple of four dataframes, model.transform(df) does as well.
stages = [dropper_1,caster,dropper_2,one_hot,dropper_3,splitter]
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df)
train_df_x, train_df_y, test_df_x, test_df_y = model.transform(df)
In [218]:
# Check number of rows
print("Number of rows in Train X: ", train_df_x.count())
print("Number of rows in Train Y: ", train_df_y.count())
print("Number of rows in Test X: ", test_df_x.count())
print("Number of rows in Test Y: ", test_df_y.count())
Number of rows in Train X:  379175
Number of rows in Train Y:  379175
Number of rows in Test X:  94839
Number of rows in Test Y:  94839
In [219]:
# Check Schema
print("Schema of Train X: ", train_df_x.printSchema())
print("Schema of Train Y: ", train_df_y.printSchema())
print("Schema of Test X: ", test_df_x.printSchema())
print("Schema of Test Y: ", test_df_y.printSchema())
root
 |-- X_float: float (nullable = true)
 |-- Y_float: float (nullable = true)
 |-- Z: double (nullable = true)
 |-- Year_int: integer (nullable = true)
 |-- Month_int: integer (nullable = true)
 |-- Day_int: integer (nullable = true)
 |-- Hour_int: integer (nullable = true)
 |-- Minute_int: integer (nullable = true)
 |-- Lat_float: float (nullable = true)
 |-- Long_float: float (nullable = true)
 |-- NEIGHBOURHOOD_index: double (nullable = false)

Schema of Train X:  None
root
 |-- TYPE_index: double (nullable = false)

Schema of Train Y:  None
root
 |-- X_float: float (nullable = true)
 |-- Y_float: float (nullable = true)
 |-- Z: double (nullable = true)
 |-- Year_int: integer (nullable = true)
 |-- Month_int: integer (nullable = true)
 |-- Day_int: integer (nullable = true)
 |-- Hour_int: integer (nullable = true)
 |-- Minute_int: integer (nullable = true)
 |-- Lat_float: float (nullable = true)
 |-- Long_float: float (nullable = true)
 |-- NEIGHBOURHOOD_index: double (nullable = false)

Schema of Test X:  None
root
 |-- TYPE_index: double (nullable = false)

Schema of Test Y:  None
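
For comparison, here is a minimal sketch (not executed above) of the same encoding step built from only the library's own stages, which is the more idiomatic Pipeline usage. The custom split stage is omitted, since randomSplit is usually applied to the transformed dataframe afterwards, and the null/zero dropper from above is reused because StringIndexer rejects null inputs by default:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Reuse the custom Drop_1 Transformer to remove nulls before indexing
clean = dropper_1.transform(df)
indexer = StringIndexer(inputCols=['TYPE', 'NEIGHBOURHOOD'],
                        outputCols=['TYPE_index', 'NEIGHBOURHOOD_index'])
encoder = OneHotEncoder(inputCols=['TYPE_index', 'NEIGHBOURHOOD_index'],
                        outputCols=['TYPE_vec', 'NEIGHBOURHOOD_vec'])
builtin_pipeline = Pipeline(stages=[indexer, encoder])
encoded = builtin_pipeline.fit(clean).transform(clean)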
In [2]:
# Define your data file path
file_path = "crime.csv"
In [3]:
# Define your dataset
# record_defaults=[''] * 12  creates a list of 12 empty strings. This indicates that all fields in the CSV file should be treated as strings, 
# and any missing or invalid values should be replaced with an empty string.
# na_value='N/A' tells CsvDataset to treat the string 'N/A' as a missing or invalid value, and 
# replace it with the default value specified in the record_defaults parameter.
dataset = tf.data.experimental.CsvDataset(
    filenames=file_path,
    record_defaults=[''] * 12,
    header=True,
    field_delim=",",
    na_value='N/A'
)
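As an aside, tf.data also ships a higher-level CSV reader; here is a brief sketch (not used in the rest of this notebook) that parses the header, infers column types, and yields batches of {column_name: tensor} dicts:

dict_dataset = tf.data.experimental.make_csv_dataset(
    file_path,
    batch_size=32,   # make_csv_dataset always batches
    num_epochs=1,    # iterate over the file once
    shuffle=False,
)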
In [238]:
# Iterate over the first n elements using the .take method
for element in dataset.take(20):
    print(element)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b'16'>, <tf.Tensor: shape=(), dtype=string, numpy=b'15'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'7'>, <tf.Tensor: shape=(), dtype=string, numpy=b'15'>, <tf.Tensor: shape=(), dtype=string, numpy=b'20'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=string, numpy=b'23'>, <tf.Tensor: shape=(), dtype=string, numpy=b'16'>, <tf.Tensor: shape=(), dtype=string, numpy=b'40'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=string, numpy=b'20'>, <tf.Tensor: shape=(), dtype=string, numpy=b'11'>, <tf.Tensor: shape=(), dtype=string, numpy=b'15'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b'17'>, <tf.Tensor: shape=(), dtype=string, numpy=b'45'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'3'>, <tf.Tensor: shape=(), dtype=string, numpy=b'26'>, <tf.Tensor: shape=(), dtype=string, numpy=b'20'>, <tf.Tensor: shape=(), dtype=string, numpy=b'45'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'3'>, <tf.Tensor: shape=(), dtype=string, numpy=b'10'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'63XX WILTSHIRE ST'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Kerrisdale'>, <tf.Tensor: shape=(), dtype=string, numpy=b'489325.58'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5452817.95'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.22805078'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.1466105'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Mischief'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'6'>, <tf.Tensor: shape=(), dtype=string, numpy=b'28'>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=string, numpy=b'13'>, <tf.Tensor: shape=(), dtype=string, numpy=b'40XX W 19TH AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Dunbar-Southlands'>, <tf.Tensor: shape=(), dtype=string, numpy=b'485903.09'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5455883.77'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.25555918'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.1937252'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2'>, <tf.Tensor: shape=(), dtype=string, numpy=b'16'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'7'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18'>, <tf.Tensor: shape=(), dtype=string, numpy=b'15'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18XX E 3RD AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Grandview-Woodland'>, <tf.Tensor: shape=(), dtype=string, numpy=b'495078.19'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457221.38'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26773386'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.067654'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1'>, <tf.Tensor: shape=(), dtype=string, numpy=b'31'>, <tf.Tensor: shape=(), dtype=string, numpy=b'19'>, <tf.Tensor: shape=(), dtype=string, numpy=b'45'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Mischief'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'27'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'40XX W 21ST AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Dunbar-Southlands'>, <tf.Tensor: shape=(), dtype=string, numpy=b'485852.96'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5455684.11'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.25376204'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.194407'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=string, numpy=b'19'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18XX E 3RD AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Grandview-Woodland'>, <tf.Tensor: shape=(), dtype=string, numpy=b'495093.69'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457230.31'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26781432'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0674411'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'24'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18'>, <tf.Tensor: shape=(), dtype=string, numpy=b'30'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18XX E 3RD AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Grandview-Woodland'>, <tf.Tensor: shape=(), dtype=string, numpy=b'495103.82'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457221.02'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26773083'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0673017'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'11'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'8'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b'63XX WINDSOR ST'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Sunset'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493790.48'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5452630.9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.22642977'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0852834'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Commercial'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'26'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2'>, <tf.Tensor: shape=(), dtype=string, numpy=b'30'>, <tf.Tensor: shape=(), dtype=string, numpy=b'10XX ALBERNI ST'>, <tf.Tensor: shape=(), dtype=string, numpy=b'West End'>, <tf.Tensor: shape=(), dtype=string, numpy=b'491067.65'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5459114.22'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.28471484'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.1228242'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Break and Enter Residential/Other'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'10'>, <tf.Tensor: shape=(), dtype=string, numpy=b'21'>, <tf.Tensor: shape=(), dtype=string, numpy=b'10'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'18XX E 3RD AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Grandview-Woodland'>, <tf.Tensor: shape=(), dtype=string, numpy=b'495119.32'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457229.95'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26781128'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0670888'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1'>, <tf.Tensor: shape=(), dtype=string, numpy=b'25'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b'30'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Offence Against a Person'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2'>, <tf.Tensor: shape=(), dtype=string, numpy=b'12'>, <tf.Tensor: shape=(), dtype=string, numpy=b''>, <tf.Tensor: shape=(), dtype=string, numpy=b''>, <tf.Tensor: shape=(), dtype=string, numpy=b'OFFSET TO PROTECT PRIVACY'>, <tf.Tensor: shape=(), dtype=string, numpy=b''>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>, <tf.Tensor: shape=(), dtype=string, numpy=b'0'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Other Theft'>, <tf.Tensor: shape=(), dtype=string, numpy=b'2003'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9'>, <tf.Tensor: shape=(), dtype=string, numpy=b'6'>, <tf.Tensor: shape=(), dtype=string, numpy=b'45'>, <tf.Tensor: shape=(), dtype=string, numpy=b'9XX TERMINAL AVE'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Strathcona'>, <tf.Tensor: shape=(), dtype=string, numpy=b'493906.5'>, <tf.Tensor: shape=(), dtype=string, numpy=b'5457452.47'>, <tf.Tensor: shape=(), dtype=string, numpy=b'49.26980201'>, <tf.Tensor: shape=(), dtype=string, numpy=b'-123.0837633'>)
In [239]:
# Use .as_numpy_iterator() to print the values more readably
for element in dataset.take(20).as_numpy_iterator():
    print(element)
(b'Other Theft', b'2003', b'5', b'12', b'16', b'15', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'5', b'7', b'15', b'20', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'23', b'16', b'40', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'20', b'11', b'15', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'12', b'17', b'45', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'3', b'26', b'20', b'45', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Break and Enter Residential/Other', b'2003', b'3', b'10', b'12', b'0', b'63XX WILTSHIRE ST', b'Kerrisdale', b'489325.58', b'5452817.95', b'49.22805078', b'-123.1466105')
(b'Mischief', b'2003', b'6', b'28', b'4', b'13', b'40XX W 19TH AVE', b'Dunbar-Southlands', b'485903.09', b'5455883.77', b'49.25555918', b'-123.1937252')
(b'Other Theft', b'2003', b'2', b'16', b'9', b'2', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Break and Enter Residential/Other', b'2003', b'7', b'9', b'18', b'15', b'18XX E 3RD AVE', b'Grandview-Woodland', b'495078.19', b'5457221.38', b'49.26773386', b'-123.067654')
(b'Other Theft', b'2003', b'1', b'31', b'19', b'45', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Mischief', b'2003', b'9', b'27', b'1', b'0', b'40XX W 21ST AVE', b'Dunbar-Southlands', b'485852.96', b'5455684.11', b'49.25376204', b'-123.194407')
(b'Break and Enter Residential/Other', b'2003', b'4', b'19', b'18', b'0', b'18XX E 3RD AVE', b'Grandview-Woodland', b'495093.69', b'5457230.31', b'49.26781432', b'-123.0674411')
(b'Break and Enter Residential/Other', b'2003', b'9', b'24', b'18', b'30', b'18XX E 3RD AVE', b'Grandview-Woodland', b'495103.82', b'5457221.02', b'49.26773083', b'-123.0673017')
(b'Break and Enter Residential/Other', b'2003', b'11', b'5', b'8', b'12', b'63XX WINDSOR ST', b'Sunset', b'493790.48', b'5452630.9', b'49.22642977', b'-123.0852834')
(b'Break and Enter Commercial', b'2003', b'9', b'26', b'2', b'30', b'10XX ALBERNI ST', b'West End', b'491067.65', b'5459114.22', b'49.28471484', b'-123.1228242')
(b'Break and Enter Residential/Other', b'2003', b'10', b'21', b'10', b'0', b'18XX E 3RD AVE', b'Grandview-Woodland', b'495119.32', b'5457229.95', b'49.26781128', b'-123.0670888')
(b'Other Theft', b'2003', b'1', b'25', b'12', b'30', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Offence Against a Person', b'2003', b'2', b'12', b'', b'', b'OFFSET TO PROTECT PRIVACY', b'', b'0', b'0', b'0', b'0')
(b'Other Theft', b'2003', b'1', b'9', b'6', b'45', b'9XX TERMINAL AVE', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
In [4]:
# Define a class that drops unwanted rows
class Drop_1():
    def __init__(self):
        pass

    def drop(self, data):
        # Drops column 6 (HUNDRED_BLOCK) by remapping only columns 0-5 and 7-11
        data = data.map(lambda *record: (record[0], record[1], record[2], record[3], record[4], record[5], record[7], record[8], record[9], record[10], record[11]))
        # Uses the filter method to keep only rows in which no field equals the empty string
        data = data.filter(lambda *record: tf.reduce_all(tf.not_equal(record, "")))
        # Drops rows whose columns 7-10 (X, Y, Latitude, Longitude) are all '0'
        data = data.filter(lambda *record: record[7] != '0' or record[8] != '0' or record[9] != '0' or record[10] != '0')
        # Combine the columns into a single string using the map method
        data = data.map(lambda *x: tf.strings.join(x, separator='|'))
        # Get distinct rows using the tf.data.experimental.unique function
        data = data.apply(tf.data.experimental.unique())
        # Split the single string back into multiple columns using the map method
        data = data.map(lambda x: tf.strings.split(x, sep='|'))
        # Split the single tensor back into individual tensors; num = the expected number of columns
        data = data.map(lambda x: tf.unstack(x, num=11))
        return data    
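The join → unique → split round trip above is the key trick: tf.data.experimental.unique only accepts datasets of scalar elements, so each row is first packed into one string. A toy sketch of the trick in isolation (hypothetical example data):

# Two duplicate rows ('a', '1') and one distinct row ('b', '2')
toy = tf.data.Dataset.from_tensor_slices((['a', 'a', 'b'], ['1', '1', '2']))
joined = toy.map(lambda *x: tf.strings.join(x, separator='|'))   # ('a', '1') -> 'a|1'
deduped = joined.apply(tf.data.experimental.unique())            # drops the repeated 'a|1'
restored = deduped.map(lambda s: tf.unstack(tf.strings.split(s, sep='|'), num=2))
for row in restored.as_numpy_iterator():
    print(row)   # two unique rows survive: ('a', '1') and ('b', '2')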
In [5]:
# This class casts columns 1-5 to integer and columns 7-10 to float.
# It keeps columns 0 and 6 unchanged and creates a new column 11 as column 7 / column 8.
class Caster():
    def __init__(self):
        pass

    def cast(self, data):
        dataset = data.map(lambda *x: (x[0],
                                      tf.strings.to_number(x[1], tf.int32),
                                      tf.strings.to_number(x[2], tf.int32),
                                      tf.strings.to_number(x[3], tf.int32),
                                      tf.strings.to_number(x[4], tf.int32),
                                      tf.strings.to_number(x[5], tf.int32),
                                      x[6],
                                      tf.strings.to_number(x[7], tf.float32),
                                      tf.strings.to_number(x[8], tf.float32),
                                      tf.strings.to_number(x[9], tf.float32),
                                      tf.strings.to_number(x[10], tf.float32),
                                      tf.divide(tf.strings.to_number(x[7], tf.float32), tf.strings.to_number(x[8], tf.float32))
                                     ))
        return dataset
In [152]:
# This class stores col_1 and col_2 as attributes and uses them in the one_hot method.
# For each column, it collects the distinct entries, builds a
# tf.feature_column.categorical_column_with_vocabulary_list, wraps that in an indicator column,
# and creates a feature layer with tf.keras.layers.DenseFeatures.
# The encoded columns are then mapped back into the dataset in place of the originals.

class One_Hot_Encoder:
    def __init__(self,col_1,col_2):
        self.col_1 = col_1
        self.col_2 = col_2
        
    def one_hot(self,data):
        column_1 = data.map(lambda *x: x[self.col_1])
        unique_elements_1 = column_1.apply(tf.data.experimental.unique())
        unique_list_1 = []
        for element in unique_elements_1:
            unique_list_1.append(element.numpy())
        categorical_column_1 = tf.feature_column.categorical_column_with_vocabulary_list(
        key='type',vocabulary_list=unique_list_1)
        indicator_column_1 = tf.feature_column.indicator_column(categorical_column_1)
        feature_layer_1 = tf.keras.layers.DenseFeatures(indicator_column_1)

        column_2 = data.map(lambda *x: x[self.col_2])
        unique_elements_2 = column_2.apply(tf.data.experimental.unique())
        unique_list_2 = []
        for element in unique_elements_2:
            unique_list_2.append(element.numpy())
        categorical_column_2 = tf.feature_column.categorical_column_with_vocabulary_list(key='neighbourhood',
        vocabulary_list=unique_list_2)
        indicator_column_2 = tf.feature_column.indicator_column(categorical_column_2)
        feature_layer_2 = tf.keras.layers.DenseFeatures(indicator_column_2)

        dataset = data.map(lambda *x: (
            feature_layer_1({'type': tf.reshape(x[0], shape=(1,))}),
            x[1], x[2], x[3], x[4], x[5],
            feature_layer_2({'neighbourhood': tf.reshape(x[6], shape=(1,))}),
            x[7], x[8], x[9], x[10], x[11]
        ))
        return dataset
In [230]:
# The split method takes arguments split_size, buffer_size, label_index, train_batch, and test_batch.
# split_size is the train/test split fraction, e.g. 0.8
# buffer_size is the size of the buffer used for shuffling the dataset
# label_index is the index of the label column
# train_batch is the batch size for the training data
# test_batch is the batch size for the test data

class train_test_split:
    def __init__(self):
        pass

    def split(self, data, split_size,buffer_size, label_index, train_batch, test_batch):
        # Finds size of dataset
        dataset_size = data.reduce(0, lambda x, _: x + 1 if isinstance(_, (list, tuple)) else x).numpy()
        # Defines train size
        train_size = int(split_size * dataset_size)
        # Take the first train_size datapoints
        train_dataset = data.take(train_size)
        # Test Data is just skipping the train_size datapoints
        test_dataset = data.skip(train_size)
        # Shuffles the training data
        train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
        # batches the data
        train_dataset = train_dataset.batch(batch_size=train_batch)
        test_dataset = test_dataset.batch(batch_size=test_batch)
        # Prefetch overlaps data preparation with consumption; tf.data.AUTOTUNE tunes the buffer size dynamically
        train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        # Splits data into features and labels
        train_features = train_dataset.map(lambda *x: x[:label_index]+x[label_index+1:])
        train_labels = train_dataset.map(lambda *x: x[label_index])
        test_features = test_dataset.map(lambda *x: x[:label_index]+x[label_index+1:])
        test_labels = test_dataset.map(lambda *x: x[label_index])
        # returns tuple of 4 data sets
        return train_features, train_labels, test_features, test_labels
In [231]:
# Create instance of each class
Dropper_1 = Drop_1() 
Caster_1 = Caster()
one_hot_encoder = One_Hot_Encoder(col_1=0,col_2=6)
Splitter =  train_test_split()
In [233]:
# Use the methods
dataset_1 = Dropper_1.drop(dataset)
dataset_2 = Caster_1.cast(dataset_1)
dataset_3 = one_hot_encoder.one_hot(dataset_2)
train_x, train_y, test_x, test_y = Splitter.split(dataset_3, split_size=0.8,buffer_size=1000, label_index=0,train_batch=1,test_batch=1)
In [178]:
# Look at dataset after dropping unwanted rows
for element in dataset_1.take(10).as_numpy_iterator():
    print(element)
(b'Other Theft', b'2003', b'5', b'12', b'16', b'15', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'5', b'7', b'15', b'20', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'23', b'16', b'40', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'20', b'11', b'15', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'4', b'12', b'17', b'45', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Other Theft', b'2003', b'3', b'26', b'20', b'45', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Break and Enter Residential/Other', b'2003', b'3', b'10', b'12', b'0', b'Kerrisdale', b'489325.58', b'5452817.95', b'49.22805078', b'-123.1466105')
(b'Mischief', b'2003', b'6', b'28', b'4', b'13', b'Dunbar-Southlands', b'485903.09', b'5455883.77', b'49.25555918', b'-123.1937252')
(b'Other Theft', b'2003', b'2', b'16', b'9', b'2', b'Strathcona', b'493906.5', b'5457452.47', b'49.26980201', b'-123.0837633')
(b'Break and Enter Residential/Other', b'2003', b'7', b'9', b'18', b'15', b'Grandview-Woodland', b'495078.19', b'5457221.38', b'49.26773386', b'-123.067654')
In [179]:
# Look at dataset after casting and making new column
for element in dataset_2.take(10).as_numpy_iterator():
    print(element)
(b'Other Theft', 2003, 5, 12, 16, 15, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Other Theft', 2003, 5, 7, 15, 20, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Other Theft', 2003, 4, 23, 16, 40, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Other Theft', 2003, 4, 20, 11, 15, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Other Theft', 2003, 4, 12, 17, 45, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Other Theft', 2003, 3, 26, 20, 45, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Break and Enter Residential/Other', 2003, 3, 10, 12, 0, b'Kerrisdale', 489325.6, 5452818.0, 49.22805, -123.146614, 0.089738116)
(b'Mischief', 2003, 6, 28, 4, 13, b'Dunbar-Southlands', 485903.1, 5455884.0, 49.255558, -123.193726, 0.08906038)
(b'Other Theft', 2003, 2, 16, 9, 2, b'Strathcona', 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(b'Break and Enter Residential/Other', 2003, 7, 9, 18, 15, b'Grandview-Woodland', 495078.2, 5457221.5, 49.267735, -123.06766, 0.09071983)
In [177]:
# Look at dataset after one hot encoding
for element in dataset_3.take(10).as_numpy_iterator():
    print(element)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 5, 12, 16, 15, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 5, 7, 15, 20, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 4, 23, 16, 40, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 4, 20, 11, 15, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 4, 12, 17, 45, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 3, 26, 20, 45, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[0., 1., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 3, 10, 12, 0, array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 489325.6, 5452818.0, 49.22805, -123.146614, 0.089738116)
(array([[0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 6, 28, 4, 13, array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 485903.1, 5455884.0, 49.255558, -123.193726, 0.08906038)
(array([[1., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 2, 16, 9, 2, array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 493906.5, 5457452.5, 49.269802, -123.08376, 0.09050129)
(array([[0., 1., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 2003, 7, 9, 18, 15, array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32), 495078.2, 5457221.5, 49.267735, -123.06766, 0.09071983)
In [237]:
# Look at data after train_test_split
# Train data
for element in train_x.take(5).as_numpy_iterator():
    print("train_x:",element,"\n")
for element in train_y.take(5).as_numpy_iterator():
    print("train_y",element,"\n")  
num_rows = train_x.reduce(0, lambda x, _: x + 1 if isinstance(_, (list, tuple)) else x).numpy()
print("train_x size:",num_rows)   
train_x: (array([2003]), array([6]), array([16]), array([21]), array([0]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([490768.06], dtype=float32), array([5458359.5], dtype=float32), array([49.277924], dtype=float32), array([-123.12692], dtype=float32), array([0.08991127], dtype=float32)) 

train_x: (array([2003]), array([11]), array([3]), array([10]), array([0]), array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([488437.], dtype=float32), array([5452725.], dtype=float32), array([49.2272], dtype=float32), array([-123.15881], dtype=float32), array([0.08957668], dtype=float32)) 

train_x: (array([2003]), array([2]), array([24]), array([21]), array([35]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([490670.44], dtype=float32), array([5458416.5], dtype=float32), array([49.278435], dtype=float32), array([-123.12827], dtype=float32), array([0.08989245], dtype=float32)) 

train_x: (array([2003]), array([4]), array([21]), array([10]), array([0]), array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0.]]], dtype=float32), array([491601.], dtype=float32), array([5455148.], dtype=float32), array([49.249046], dtype=float32), array([-123.11541], dtype=float32), array([0.0901169], dtype=float32)) 

train_x: (array([2003]), array([11]), array([8]), array([22]), array([15]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([491191.2], dtype=float32), array([5458128.], dtype=float32), array([49.275845], dtype=float32), array([-123.1211], dtype=float32), array([0.08999261], dtype=float32)) 

train_y [[[0. 0. 0. 1. 0. 0. 0. 0. 0.]]] 

train_y [[[0. 0. 0. 0. 1. 0. 0. 0. 0.]]] 

train_y [[[0. 1. 0. 0. 0. 0. 0. 0. 0.]]] 

train_y [[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]] 

train_y [[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]] 

train_x size: 379218
In [236]:
# Test data
for element in test_x.take(5).as_numpy_iterator():
    print("test_x:",element,"\n")
for element in test_y.take(5).as_numpy_iterator():
    print("test_y",element,"\n")  
num_rows = test_x.reduce(0, lambda x, _: x + 1 if isinstance(_, (list, tuple)) else x).numpy()
print("test_x size:",num_rows)   
test_x: (array([2014]), array([6]), array([10]), array([23]), array([0]), array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([489974.1], dtype=float32), array([5456460.5], dtype=float32), array([49.260826], dtype=float32), array([-123.137794], dtype=float32), array([0.08979706], dtype=float32)) 

test_x: (array([2014]), array([3]), array([15]), array([18]), array([0]), array([[[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([492606.22], dtype=float32), array([5453016.], dtype=float32), array([49.22988], dtype=float32), array([-123.101555], dtype=float32), array([0.09033647], dtype=float32)) 

test_x: (array([2014]), array([3]), array([30]), array([13]), array([23]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([491511.62], dtype=float32), array([5459023.5], dtype=float32), array([49.283905], dtype=float32), array([-123.116714], dtype=float32), array([0.09003655], dtype=float32)) 

test_x: (array([2014]), array([4]), array([2]), array([16]), array([4]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([491511.62], dtype=float32), array([5459023.5], dtype=float32), array([49.283905], dtype=float32), array([-123.116714], dtype=float32), array([0.09003655], dtype=float32)) 

test_x: (array([2014]), array([4]), array([2]), array([16]), array([45]), array([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=float32), array([491511.62], dtype=float32), array([5459023.5], dtype=float32), array([49.283905], dtype=float32), array([-123.116714], dtype=float32), array([0.09003655], dtype=float32)) 

test_y [[[0. 0. 0. 0. 1. 0. 0. 0. 0.]]] 

test_y [[[0. 1. 0. 0. 0. 0. 0. 0. 0.]]] 

test_y [[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]] 

test_y [[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]] 

test_y [[[1. 0. 0. 0. 0. 0. 0. 0. 0.]]] 

test_x size: 94805
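
Finally, a minimal sketch (again assuming the shapes printed above) of handing the result to Keras: zip the feature and label datasets back into (x, y) pairs, which model.fit can consume given a model whose inputs match the 11 feature tensors. One caveat: because the training data was shuffled without a fixed seed, train_x and train_y may iterate in different orders, so in practice it is safer to shuffle with a seed (or to split features from labels after zipping) to keep them aligned:

# Pair features with labels again; each element becomes (features_tuple, label)
train_ds = tf.data.Dataset.zip((train_x, train_y))
for features, label in train_ds.take(1):
    print(len(features), label.shape)   # 11 feature tensors and one one-hot label
# model.fit(train_ds, epochs=...) would then train a matching multi-input model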